CP255 Final Project

Relationship between Health, Urban Environs and Socioethnic Status

Cheng-Kai Hsu, PhD Student in CRP, 2025

Part 0: Module and data source
Part 1: CA-level data analysis and visualization
Part 2: SF-level data analysis and visualization
Part 3: Interactive mapping

PART 0:

Module import, data source description

In [129]:
# Import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import libpysal as ps

# visualization, basemap, interactive mapping
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from shapely.geometry import Point
import contextily as cx
import fiona
%matplotlib inline
import pydeck as pdk
import folium
import osmapi as osm
import osmnx as ox
from colorcet import fire


# For API
import requests
import json
import shapely
import pprint as pp


# pip install sodapy for downloading CalEnviroScreen 3.0 data
from sodapy import Socrata


# For regression and GWR:
import statsmodels.api as sm
from patsy import dmatrices
import esda
import splot
from splot.libpysal import plot_spatial_weights
from splot.esda import moran_scatterplot, plot_moran
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import compare_surfaces, truncate_colormap



# ------- Data description -------
# State-wide (CA): geographic, health status, environment and traffic, socio-economic status, ethnicity
# City-wide (SF): Physical activity (bike sharing, bike station, park, tree, sidewalk)

# Sources:
# https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-30
# https://www2.census.gov/geo/tiger/GENZ2018/shp/
# https://data.sfgov.org/resource/7jbp-yzp3.json (bike sharing)
# https://opendata.arcgis.com/datasets/b8ad067022374acd966db3026ddab6fc_0.geojson (bike rack)
# https://data.sfgov.org/Transportation/Map-of-Sidewalk-Widths/ygcm-bt3x (sidewalk) 


# ------- (Geo)Dataframe naming-------
# State level:
# df1: CA geographic data
# df2: CalEnviroScreen 3.0 data, including health outcomes (e.g. asthma), environs (e.g. traffic, PM2.5, O3),
# socio-economic status (e.g. poverty, ethnicity), etc.
# df3: merge df1 and df2

# City level:
# df5: sidewalk
# df6: bike 
# df7: bike sharing

# ------ presentation format -------
# I referenced the following webpage to produce the slideshow for the recorded presentation
# https://medium.com/learning-machine-learning/present-your-data-science-projects-with-jupyter-slides-75f20735eb0f

PART 1: CA-level

Data description, correlation analysis, visualization

Data import

In [2]:
# import geo data for CA and name it df1
df1 = gpd.read_file('data/cb_2018_06_tract_500k/cb_2018_06_tract_500k.shp')
In [3]:
# display the first 3 rows of df1
df1.head(3)
Out[3]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry
0 06 009 000300 1400000US06009000300 06009000300 3 CT 457009794 394122 POLYGON ((-120.76399 38.21389, -120.76197 38.2...
1 06 011 000300 1400000US06011000300 06011000300 3 CT 952744514 195376 POLYGON ((-122.50006 39.12232, -122.50022 39.1...
2 06 013 303102 1400000US06013303102 06013303102 3031.02 CT 6507019 0 POLYGON ((-121.72937 37.96884, -121.71409 37.9...
In [4]:
# plot the geo data
df1.plot(color='none', edgecolor='gray', linewidth=.2, figsize=(5,10))
Out[4]:
<AxesSubplot:>
In [5]:
# check the coordinate system
df1.crs
Out[5]:
<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - NAD83
- bounds: (167.65, 14.92, -47.74, 86.46)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
In [6]:
# import CalEnviroScreen 3.0 data and name it df2, following the instructions provided by the data source
# https://www.opendatanetwork.com/dataset/www.transparentrichmond.org/wi9v-3g3n
client = Socrata("www.transparentrichmond.org", None)

# Returned as JSON from API 
results = client.get("wi9v-3g3n", limit=20000)

# Convert to pandas DataFrame
df2 = pd.DataFrame.from_records(results)
WARNING:root:Requests made without an app_token will be subject to strict throttling limits.
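The throttling warning above can be avoided by registering a free Socrata app token and passing it as the second argument; a one-line sketch, where "MY_APP_TOKEN" is a placeholder rather than a real credential:

# optional: an app token lifts the strict throttling limits ("MY_APP_TOKEN" is a placeholder)
# client = Socrata("www.transparentrichmond.org", "MY_APP_TOKEN")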
In [7]:
# display data, including health outcomes (e.g. asthma), environs (e.g. traffic, PM2.5, O3, diesel PM),
# socioethnic status (e.g. poverty, education level, ethnicity)
df2.head(2)
Out[7]:
the_geom tract pop2010 california zip city longitude latitude ciscore ciscorep ... african_am native_ame asian_amer other_pct objectid_1 cidecile civigintil shape_leng shape_area ces2018_rn
0 {'type': 'MultiPolygon', 'coordinates': [[[[-1... 6083002103.0 3930 Santa Barbara 93454 Santa Maria -120.4270588 34.9306689 29.51 59 ... 1.9 0.5 7.2 1.6 3507 6 12 6999.35762193 2847611.28417 55-60%
1 {'type': 'MultiPolygon', 'coordinates': [[[[-1... 6083002402.0 11406 Santa Barbara 93455 Santa Maria -120.4780833 34.9287963 33.17 65 ... 1.4 0.2 5.5 1.6 2733 7 14 19100.5780032 16352920.1391 65-70%

2 rows × 71 columns

In [8]:
# display all the columns (i.e. variables)
df2.columns
Out[8]:
Index(['the_geom', 'tract', 'pop2010', 'california', 'zip', 'city',
       'longitude', 'latitude', 'ciscore', 'ciscorep', 'ozone', 'ozonep', 'pm',
       'pmp', 'diesel', 'dieselp', 'drink', 'drinkp', 'pest', 'pestp',
       'rseihaz', 'rseihazp', 'traffic', 'trafficp', 'cleanups', 'cleanupsp',
       'gwthreats', 'gwthreatsp', 'haz', 'hazp', 'iwb', 'iwbp', 'swis',
       'swisp', 'pollution', 'pollutions', 'pollutionp', 'asthma', 'asthmap',
       'lbw', 'lbwp', 'cvd', 'cvdp', 'edu', 'edup', 'ling', 'lingp', 'pov',
       'povp', 'unemp', 'unempp', 'housingb', 'housingbp', 'popchar',
       'popcharsco', 'popcharp', 'children_u', 'pop_11_64_', 'elderly_ov',
       'hispanic_p', 'white_pct', 'african_am', 'native_ame', 'asian_amer',
       'other_pct', 'objectid_1', 'cidecile', 'civigintil', 'shape_leng',
       'shape_area', 'ces2018_rn'],
      dtype='object')

The variables of interest to this project include geographic identifiers ('tract' and 'zip') as well as variables within four main dimensions: health status ('asthma', 'lbw', 'cvd'), environment and traffic ('traffic', 'diesel', 'pm'), socio-economic status ('edu', 'ling', 'pov'), and ethnicity ('white_pct', 'african_am', 'asian_amer'). Each variable used in this project is described as follows:

'zip':Postal ZIP Code that the census tract falls within

'asthma': Age-adjusted rate of emergency department visits for asthma.

'lbw': Percent low birth weight

'cvd': Age-adjusted rate of emergency department visits for heart attacks (cardiovascular disease)

'traffic': Traffic density, in vehicle-kilometers per hour per road length, within 150 meters of the census tract boundary

'diesel': Diesel PM emissions from on-road and non-road sources

'pm': Annual mean PM 2.5 concentrations

'edu': Percent of population over 25 with less than a high school education

'ling': Percent limited English speaking households

'pov': Percent of population living below two times the federal poverty level

'white_pct': Percent of white households

'african_am': Percent of African American households

'asian_amer': Percent of Asian American households

The description for other variables can be found via the link: https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-30.

In [9]:
# Extract variables of interest
df2=df2[['tract','zip','pm','traffic', 'diesel','asthma','lbw','cvd','edu','ling','pov','white_pct','african_am','asian_amer']]
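Since the Socrata API returns every field as a string, the measure columns could alternatively be cast to numeric in one pass here; a sketch (the notebook instead converts each column with pd.to_numeric where it is first needed):

# batch conversion of the measure columns; errors='coerce' turns bad values into NaN
num_cols = ['pm','traffic','diesel','asthma','lbw','cvd','edu','ling','pov','white_pct','african_am','asian_amer']
df2[num_cols] = df2[num_cols].apply(pd.to_numeric, errors='coerce')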
In [10]:
# (see the following two cells)
# The two imported datasets use different naming conventions for the tract identifier
In [11]:
df1.GEOID[:5]
Out[11]:
0    06009000300
1    06011000300
2    06013303102
3    06013303202
4    06013303203
Name: GEOID, dtype: object
In [12]:
df2.tract[:5]
Out[12]:
0    6083002103.0
1    6083002402.0
2    6083002102.0
3    6083002010.0
4    6083002009.0
Name: tract, dtype: object
In [13]:
# For df2, strip the trailing ".0" from the tract strings
df2['tract'] = df2['tract'].str.replace(r'\.0$', '', regex=True)
        
df2['tract'][:5]
Out[13]:
0    6083002103
1    6083002402
2    6083002102
3    6083002010
4    6083002009
Name: tract, dtype: object
In [14]:
# For df1, remove the first character (i.e. "0")
df1.GEOID=df1.GEOID.map(lambda x: x[1:])
df1['GEOID'][:5]
Out[14]:
0    6009000300
1    6011000300
2    6013303102
3    6013303202
4    6013303203
Name: GEOID, dtype: object
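An equivalent alternative would be to left-pad df2's tract out to the 11-digit GEOID format instead of stripping df1's leading zero; a one-line sketch (not applied here):

# df2['tract'] = df2['tract'].str.zfill(11)   # would match df1.GEOID directly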
In [15]:
# merge the two datasets (i.e. df1 and df2) into a new dataframe called df3
df3=pd.merge(df1, df2, left_on='GEOID', right_on='tract')
df3.head(2)
Out[15]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... diesel asthma lbw cvd edu ling pov white_pct african_am asian_amer
0 06 009 000300 1400000US06009000300 6009000300 3 CT 457009794 394122 POLYGON ((-120.76399 38.21389, -120.76197 38.2... ... 0.26 58.33 4.04 7.49 8.4 0.7 28.0 83.9 1.1 1.1
1 06 011 000300 1400000US06011000300 6011000300 3 CT 952744514 195376 POLYGON ((-122.50006 39.12232, -122.50022 39.1... ... 1.53 30.94 4.83 9.47 41.5 12.5 39.8 26.6 0.9 2.0

2 rows × 24 columns

In [16]:
# (see the following two cells)
# make thematic maps of traffic density and households living in poverty
In [17]:
df3["traffic"] = pd.to_numeric(df3["traffic"], downcast="float")

base = df3.plot(color='none', edgecolor='gray', linewidth=.2, figsize=(4,3))
df3.plot(ax=base, column='traffic',legend=True)

# The result looks odd; this is resolved later by removing traffic outliers
Out[17]:
<AxesSubplot:>
In [18]:
df3["pov"] = pd.to_numeric(df3["pov"], downcast="float")

base = df3.plot(color='none', edgecolor='gray', linewidth=.2, figsize=(4,3))
df3.plot(ax=base, column='pov',legend=True)

# The result makes sense; the next cell maps each variable with brief descriptions.
Out[18]:
<AxesSubplot:>

CA-level Choropleth Maps

In [19]:
# Plot choropleth maps for the 12 variables within the four dimensions
# (i.e. health, socioeconomic, ethnicity, environs)
# no clear pattern is visible for traffic density

f,(ax1,ax2,ax3) = plt.subplots(3,4,figsize=(30,16))
f.suptitle("Choropleth Maps", fontsize=32)

    # Health
df3["cvd"] = pd.to_numeric(df3["cvd"], downcast="float")
df3["lbw"] = pd.to_numeric(df3["lbw"], downcast="float")
df3["asthma"] = pd.to_numeric(df3["asthma"], downcast="float")

    # Socio-economic
df3["pov"] = pd.to_numeric(df3["pov"], downcast="float")
df3["edu"] = pd.to_numeric(df3["edu"], downcast="float")
df3["ling"] = pd.to_numeric(df3["ling"], downcast="float")

    # Ethnicity
df3["white_pct"] = pd.to_numeric(df3["white_pct"], downcast="float")
df3["asian_amer"] = pd.to_numeric(df3["asian_amer"], downcast="float")
df3["african_am"] = pd.to_numeric(df3["african_am"], downcast="float")

    # Environs
df3["traffic"] = pd.to_numeric(df3["traffic"], downcast="float")
df3["pm"] = pd.to_numeric(df3["pm"], downcast="float")
df3["diesel"] = pd.to_numeric(df3["diesel"], downcast="float")


df3.plot(ax=ax1[0], column='cvd',legend=True, cmap="cet_CET_L8")
ax1[0].set_title("Cardiovascular Disease", fontsize=16)

df3.plot(ax=ax2[0], column='asthma',legend=True, cmap="cet_CET_L8")
ax2[0].set_title("Asthma", fontsize=16)

df3.plot(ax=ax3[0], column='lbw',legend=True, cmap="cet_CET_L8")
ax3[0].set_title("Low birth weight", fontsize=16)



df3.plot(ax=ax1[1], column='white_pct',legend=True, cmap='cet_fire')
ax1[1].set_title("% White Pop.", fontsize=16)

df3.plot(ax=ax2[1], column='asian_amer',legend=True, cmap='cet_fire')
ax2[1].set_title("% Asian American Pop.", fontsize=16)

df3.plot(ax=ax3[1], column='african_am',legend=True, cmap='cet_fire')
ax3[1].set_title("% African American Pop.", fontsize=16)



df3.plot(ax=ax1[2], column='pov',legend=True, cmap='cet_CET_D6')
ax1[2].set_title("Poverty", fontsize=16)

df3.plot(ax=ax2[2], column='edu',legend=True, cmap='cet_CET_D6')
ax2[2].set_title("% below High School Edu", fontsize=16)

df3.plot(ax=ax3[2], column='ling',legend=True, cmap='cet_CET_D6')
ax3[2].set_title("Language barrier", fontsize=16)



df3.plot(ax=ax1[3], column='traffic',legend=True,cmap='cet_CET_D4')
ax1[3].set_title("Traffic density", fontsize=16)

df3.plot(ax=ax2[3], column='pm',legend=True, cmap='cet_CET_D4')
ax2[3].set_title("PM2.5", fontsize=16)

df3.plot(ax=ax3[3], column='diesel',legend=True, cmap='cet_CET_D4')
ax3[3].set_title("Diesel PM", fontsize=16);

Descriptive statistics (box plots and histograms)

In [20]:
f=plt.figure(figsize=(18, 6))
f.suptitle("Box Plots", fontsize=24)

plt.subplot(261)
plt.boxplot(df3.cvd)
plt.title('Cardiovascular disease', fontsize=10)

plt.subplot(262)
plt.boxplot(df3.lbw)
plt.title('Low birth weight', fontsize=10)

plt.subplot(263)
plt.boxplot(df3.asthma)
plt.title('Asthma', fontsize=10)

plt.subplot(264)
plt.boxplot(df3.pov)
plt.title('poverty', fontsize=10)

plt.subplot(265)
plt.boxplot(df3.edu)
plt.title('Education level', fontsize=10)

plt.subplot(266)
plt.boxplot(df3.ling)
plt.title('Linguistic barrier', fontsize=10)

plt.subplot(267)
plt.boxplot(df3.white_pct)
plt.title('% White households', fontsize=10)

plt.subplot(268)
plt.boxplot(df3.asian_amer)
plt.title('% Asian Am. households', fontsize=10)

plt.subplot(269)
plt.boxplot(df3.african_am)
plt.title('% African Am. households', fontsize=10)

plt.subplot(2,6,10)
plt.boxplot(df3.traffic)
plt.title('Traffic density', fontsize=10)

plt.subplot(2,6,11)
plt.boxplot(df3.diesel)
plt.title('Diesel PM', fontsize=10)

plt.subplot(2,6,12)
plt.boxplot(df3.pm)
plt.title('PM2.5 concentration', fontsize=10);
In [21]:
df3['traffic'].describe()
Out[21]:
count     8034.000000
mean       936.348999
std        907.578857
min          0.000000
25%        437.057495
50%        695.369995
75%       1184.949982
max      45687.871094
Name: traffic, dtype: float64
In [22]:
# remove the outliers in 'traffic density'
low=df3['traffic'].quantile(.01)
high=df3['traffic'].quantile(.99)
high
Out[22]:
3672.8413183593752
In [23]:
df3_f = df3[(df3['traffic'] < high) & (df3['traffic'] > low)]
df3_f['traffic'].describe()
Out[23]:
count    7872.000000
mean      905.971558
std       672.920044
min        64.830002
25%       441.995003
50%       695.369995
75%      1170.357452
max      3671.219971
Name: traffic, dtype: float64
In [24]:
f=plt.figure(figsize=(18, 6))
f.suptitle("Box Plots", fontsize=24)

plt.subplot(261)
plt.boxplot(df3.cvd)
plt.title('Cardiovascular disease', fontsize=10)

plt.subplot(262)
plt.boxplot(df3.lbw)
plt.title('Low birth weight', fontsize=10)

plt.subplot(263)
plt.boxplot(df3.asthma)
plt.title('Asthma', fontsize=10)

plt.subplot(264)
plt.boxplot(df3.pov)
plt.title('poverty', fontsize=10)

plt.subplot(265)
plt.boxplot(df3.edu)
plt.title('Education level', fontsize=10)

plt.subplot(266)
plt.boxplot(df3.ling)
plt.title('Linguistic barrier', fontsize=10)

plt.subplot(267)
plt.boxplot(df3.white_pct)
plt.title('% White households', fontsize=10)

plt.subplot(268)
plt.boxplot(df3.asian_amer)
plt.title('% Asian Am. households', fontsize=10)

plt.subplot(269)
plt.boxplot(df3.african_am)
plt.title('% African Am. households', fontsize=10)

plt.subplot(2,6,10)
plt.boxplot(df3_f.traffic)
plt.title('Traffic density', fontsize=10)

plt.subplot(2,6,11)
plt.boxplot(df3.diesel)
plt.title('Diesel PM', fontsize=10)

plt.subplot(2,6,12)
plt.boxplot(df3.pm)
plt.title('PM2.5 concentration', fontsize=10);

plt.show();
In [25]:
# another view of the descriptive statistics: histograms to inspect each variable's distribution

fig, ax = plt.subplots(2, 6,figsize=(20,5))
fig.suptitle("Histograms", fontsize=20)

m=12
# start from 'pm', which is at column index 12
for i in range(2):
    for j in range(6):
        df3_f.hist(column = df3_f.columns[m], ax=ax[i,j], figsize=(20, 18))
        m+=1
In [26]:
# We have three geographic levels: tract, zip, and city (SF).
# To reduce the data, use a groupby to compute mean values for each zip code
df3_1=df3_f.groupby(df3_f['zip']).mean()
df3_1.head()
Out[26]:
ALAND AWATER pm traffic diesel asthma lbw cvd edu ling pov white_pct african_am asian_amer
zip
32 3.003961e+09 20663794.0 6.32 117.139999 0.13 27.590000 3.12 5.72 26.0 12.1 45.500000 52.599998 0.6 0.6
35 3.191027e+09 7990907.0 4.88 87.540001 0.12 30.620001 0.00 10.97 8.4 0.5 39.099998 89.400002 0.6 0.4
40 2.163200e+09 35550930.0 5.26 106.870003 0.09 39.930000 6.73 11.68 10.3 0.0 35.500000 88.000000 0.4 1.0
48 1.193827e+09 10910635.0 3.63 76.430000 0.35 34.770000 3.59 7.17 21.0 0.8 56.000000 73.800003 1.0 0.7
51 1.219176e+09 26378126.0 6.28 154.729996 0.20 31.760000 3.50 6.55 5.4 0.0 31.000000 87.900002 0.2 0.7
In [27]:
len(df3)
Out[27]:
8034
In [28]:
# Reduce data from 8034 to 1334 rows
len(df3_1)
Out[28]:
1334
In [29]:
# tracts with more white households have lower linguistic barriers, as expected
lm=sns.lmplot("white_pct", "ling", data=df3_1, fit_reg=True, scatter=True, scatter_kws={"s": .1})
lm.fig.set_figwidth(3)
lm.fig.set_figheight(3)
In [30]:
# lower educational attainment correlates positively with linguistic barriers, as expected
lm=sns.lmplot("edu", "ling", data=df3_1,scatter_kws={"s": .1})
lm.fig.set_figwidth(3)
lm.fig.set_figheight(3)
In [31]:
# diesel PM correlates positively with traffic, one of its main sources
lm=sns.lmplot("diesel", "traffic", data=df3_1,scatter_kws={"s": .1})
lm.fig.set_figwidth(3)
lm.fig.set_figheight(3)
In [32]:
# less affluent areas tend to have higher rates of low birth weight
lm=sns.lmplot("pov", "lbw", data=df3_1,scatter_kws={"s": 1})
lm.fig.set_figwidth(3)
lm.fig.set_figheight(3)
In [33]:
# places with more white households tend to have lower asthma rates
lm=sns.lmplot("white_pct", "asthma", data=df3_1,scatter_kws={"s": 1})
lm.fig.set_figwidth(3)
lm.fig.set_figheight(3)

CA-level correlation analysis results

In [34]:
dfData = df3_1.iloc[:,2:].corr()
plt.subplots(figsize=(8, 8)) 
sns.heatmap(dfData, annot=True,vmax=1, vmin=-1, square=False, cmap="RdBu", alpha=.9, linewidths=2, linecolor='white', cbar=True)
plt.suptitle('Correlation Analysis', fontsize=20);
In [35]:
dfData = df3_1.iloc[:,5:11].corr()
plt.subplots(figsize=(6, 6)) 
mask = np.zeros_like(dfData)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dfData, annot=True,vmax=1, vmin=-1,mask=mask, square=False, cmap="RdBu", alpha=.9, linewidths=2, linecolor='white', cbar=True)
plt.suptitle('Correlation Analysis', fontsize=15);
In [36]:
# tracts with more white households tend to bear fewer social and health burdens
sns.pairplot(df3_1,vars=["edu","ling","pov",'asthma',"white_pct"],height=1,diag_kind="kde",corner=True,plot_kws=dict(marker="+", linewidth=.1))
Out[36]:
<seaborn.axisgrid.PairGrid at 0x1a4e6e85d0>

PART 2: SF-level

Global regression, GWR, result mapping

Data extraction

In [37]:
# Extract tracts with the label '075' in the column 'COUNTYFP' (San Francisco County) and display the first two rows.
SF = df3[df3['COUNTYFP']=='075']
SF.head(2)
Out[37]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... diesel asthma lbw cvd edu ling pov white_pct african_am asian_amer
526 06 075 010200 1400000US06075010200 6075010200 102 CT 515958 295388 POLYGON ((-122.42667 37.80964, -122.42488 37.8... ... 73.970001 43.160000 4.42 4.39 1.6 1.1 12.1 81.099998 0.6 10.200000
527 06 075 011200 1400000US06075011200 6075011200 112 CT 177414 0 POLYGON ((-122.41629 37.79389, -122.41522 37.7... ... 111.129997 27.790001 5.13 3.71 10.6 13.1 26.0 55.099998 1.5 35.099998

2 rows × 24 columns

In [38]:
# There are 195 data points left.
len(SF)
Out[38]:
195
In [39]:
# Plot the geodataframe SF
SF.plot(color='none', edgecolor='gray', linewidth=.2, figsize=(5,10))
Out[39]:
<AxesSubplot:>
In [40]:
# Make box plots again, this time for the SF subset, to check for erroneous values
# No outliers found
In [41]:
plt.figure(figsize=(16, 4))

plt.subplot(261)
plt.boxplot(SF.cvd)
plt.title('Cardiovascular disease', fontsize=10)

plt.subplot(262)
plt.boxplot(SF.lbw)
plt.title('Low birth weight', fontsize=10)

plt.subplot(263)
plt.boxplot(SF.asthma)
plt.title('Asthma', fontsize=10)

plt.subplot(264)
plt.boxplot(SF.pov)
plt.title('Poverty', fontsize=10)

plt.subplot(265)
plt.boxplot(SF.edu)
plt.title('Education level', fontsize=10)

plt.subplot(266)
plt.boxplot(SF.ling)
plt.title('Linguistic barrier', fontsize=10)

plt.subplot(267)
plt.boxplot(SF.white_pct)
plt.title('% White households', fontsize=10)

plt.subplot(268)
plt.boxplot(SF.asian_amer)
plt.title('% Asian Am. households', fontsize=10)

plt.subplot(269)
plt.boxplot(SF.african_am)
plt.title('% African Am. households', fontsize=10)

plt.subplot(2,6,10)
plt.boxplot(SF.traffic)
plt.title('Traffic density', fontsize=10)

plt.subplot(2,6,11)
plt.boxplot(SF.diesel)
plt.title('Diesel PM', fontsize=10)

plt.subplot(2,6,12)
plt.boxplot(SF.pm)
plt.title('PM2.5 concentration', fontsize=10);

SF-level correlation analysis results

In [42]:
dfData = SF[['pm','traffic','diesel','asthma','lbw','cvd','edu','ling','pov','white_pct','african_am','asian_amer']].corr()
plt.subplots(figsize=(8,8)) 

sns.heatmap(dfData, annot=True, vmax=1, square=False, cmap="RdBu", alpha=.9, linewidths=2, linecolor='white', cbar=True)
plt.show()
In [43]:
dfData = SF[['edu','ling','pov','white_pct','asian_amer','african_am']].corr()
plt.subplots(figsize=(4, 4)) 

sns.heatmap(dfData, annot=True, vmax=1, square=False, cmap="RdBu", alpha=.9, linewidths=2, linecolor='white', cbar=True)
plt.show()

SF-level Choropleth maps

In [44]:
f,(ax1,ax2,ax3) = plt.subplots(3,4,figsize=(30,16))
f.suptitle("Choropleth Maps", fontsize=32)

# no numeric conversion needed here: the columns were already cast to float
# on df3, from which SF was extracted


SF.plot(ax=ax1[0], column='cvd',legend=True, cmap="cet_fire")
ax1[0].set_title("Cardiovascular Disease", fontsize=16)

SF.plot(ax=ax2[0], column='asthma',legend=True, cmap="cet_fire")
ax2[0].set_title("Asthma", fontsize=16)

SF.plot(ax=ax3[0], column='lbw',legend=True, cmap="cet_fire")
ax3[0].set_title("Low birth weight", fontsize=16)



SF.plot(ax=ax1[1], column='pov',legend=True, cmap='cet_CET_L17_r')
ax1[1].set_title("Poverty", fontsize=16)

SF.plot(ax=ax2[1], column='edu',legend=True, cmap='cet_CET_L17_r')
ax2[1].set_title("% below High School Edu", fontsize=16)

SF.plot(ax=ax3[1], column='ling',legend=True, cmap='cet_CET_L17_r')
ax3[1].set_title("Language barrier", fontsize=16)



SF.plot(ax=ax1[2], column='white_pct',legend=True, cmap='copper')
ax1[2].set_title("% White Pop.", fontsize=16)

SF.plot(ax=ax2[2], column='asian_amer',legend=True, cmap='copper')
ax2[2].set_title("% Asian American Pop.", fontsize=16)

SF.plot(ax=ax3[2], column='african_am',legend=True, cmap='copper')
ax3[2].set_title("% African American Pop.", fontsize=16)


SF.plot(ax=ax1[3], column='traffic',legend=True,cmap='cet_CET_D6_r')
ax1[3].set_title("Traffic density", fontsize=16)

SF.plot(ax=ax2[3], column='pm',legend=True, cmap='cet_CET_D6_r')
ax2[3].set_title("PM2.5", fontsize=16)

SF.plot(ax=ax3[3], column='diesel',legend=True, cmap='cet_CET_D6_r')
ax3[3].set_title("Diesel PM", fontsize=16);

Adding another dimension of variables concerning physical activity (walking/biking facilities)

1. Sidewalk

In [45]:
# load sidewalk data
df5 = pd.read_csv("data/MTA.sidewalk_widths.csv")
df5.head(2)
Out[45]:
CNN STREET ST_TYPE CLASS WIDTH_MIN WIDTH_RECO SIDEWALK_F SIDE shape
0 105000.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.39702 37.78933, -122.39656 37...
1 104002.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.397255 37.789516, -122.39702 ...
In [46]:
# 16174 rows in total
len(df5)
Out[46]:
16174
In [47]:
# the 'shape' column holds WKT strings, not geometries that geopandas can use
type(df5['shape'][0])
Out[47]:
str
In [48]:
# convert the string to a shapely.geometry.LineString via WKT parsing
# print a single result to check
shapely.wkt.loads(df5['shape'][0])
Out[48]:
In [49]:
# transforming the data type for all rows
df5['shape_'] = df5['shape'].apply(lambda x: shapely.wkt.loads(x))
In [50]:
df5.head(3)
Out[50]:
CNN STREET ST_TYPE CLASS WIDTH_MIN WIDTH_RECO SIDEWALK_F SIDE shape shape_
0 105000.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.39702 37.78933, -122.39656 37... LINESTRING (-122.39702 37.78933, -122.39656 37...
1 104002.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.397255 37.789516, -122.39702 ... LINESTRING (-122.397255 37.789516, -122.39702 ...
2 104001.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.39756 37.789757, -122.397255 ... LINESTRING (-122.39756 37.789757, -122.397255 ...
In [51]:
# convert the DataFrame (df5) into a GeoDataFrame (gdf5)
gdf5 = gpd.GeoDataFrame(df5, crs=df1.crs, geometry=df5['shape_'])
gdf5.head(2)
Out[51]:
CNN STREET ST_TYPE CLASS WIDTH_MIN WIDTH_RECO SIDEWALK_F SIDE shape shape_ geometry
0 105000.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.39702 37.78933, -122.39656 37... LINESTRING (-122.39702 37.78933, -122.39656 37... LINESTRING (-122.39702 37.78933, -122.39656 37...
1 104002.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.397255 37.789516, -122.39702 ... LINESTRING (-122.397255 37.789516, -122.39702 ... LINESTRING (-122.39726 37.78952, -122.39702 37...
In [52]:
# displaying the sidewalk width range
gdf5['SIDEWALK_F'].describe()
Out[52]:
count    16174.000000
mean         8.652776
std         30.897292
min         -3.000000
25%          0.000000
50%         10.000000
75%         15.000000
max       1915.000000
Name: SIDEWALK_F, dtype: float64
In [53]:
# keep only sidewalks with width >= 15 (the 75th percentile)
base = SF.plot(color='none', edgecolor='r', linewidth=.1, figsize=(6,6))
gdf5[gdf5['SIDEWALK_F']>=15].plot(ax=base,edgecolor='gray', linewidth=.2, figsize=(5,5), legend=True)
Out[53]:
<AxesSubplot:>
In [54]:
# intersect SF (main dataset) with gdf5 (sidewalk data) and create a new geodataframe called sidewalk
sidewalk=gpd.overlay(gdf5[gdf5['SIDEWALK_F']>=15], SF, how='intersection')
In [55]:
# every sidewalk segment now carries the tract columns for grouping back by tract
sidewalk.head(1)
Out[55]:
CNN STREET ST_TYPE CLASS WIDTH_MIN WIDTH_RECO SIDEWALK_F SIDE shape shape_ ... asthma lbw cvd edu ling pov white_pct african_am asian_amer geometry
0 105000.0 01ST ST Downtown Commercial 10.0 12.0 15.0 W LINESTRING (-122.39702 37.78933, -122.39656 37... LINESTRING (-122.39702 37.78933, -122.39656 37... ... 21.389999 5.85 3.79 4.6 8.7 19.0 54.700001 3.7 30.9 LINESTRING (-122.39702 37.78933, -122.39656 37...

1 rows × 34 columns

In [56]:
# calculate each segment's length and store it in a new column called sidewalk_length
sidewalk['sidewalk_length']=sidewalk.length
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  """Entry point for launching an IPython kernel.
In [57]:
# "groupby"ing back using "TRACTCE"
s=sidewalk.groupby('TRACTCE').sum()
In [58]:
len(SF)
Out[58]:
195
In [59]:
# 16 census tracts have no qualifying sidewalk, since there would otherwise be 195 rows
len(s)
Out[59]:
179
In [60]:
# Merge only the column 'sidewalk_length' back to SF
SF = SF.merge(s['sidewalk_length'],how='left',  left_on='TRACTCE', right_index=True)
In [61]:
SF.plot(column='sidewalk_length',legend=True)
Out[61]:
<AxesSubplot:>
In [62]:
# Show the null values in the 'sidewalk_length' column
SF[SF['sidewalk_length'].isnull()].head(2)
Out[62]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... asthma lbw cvd edu ling pov white_pct african_am asian_amer sidewalk_length
528 06 075 012401 1400000US06075012401 6075012401 124.01 CT 91972 0 POLYGON ((-122.41771 37.78424, -122.41607 37.7... ... 88.760002 5.63 6.05 31.400000 28.700001 71.199997 24.5 9.8 25.400000 NaN
532 06 075 017801 1400000US06075017801 6075017801 178.01 CT 214052 0 POLYGON ((-122.40493 37.78150, -122.40271 37.7... ... 68.430000 7.48 7.92 36.299999 55.599998 58.299999 25.6 3.2 64.599998 NaN

2 rows × 25 columns

In [63]:
# fill up the null values with zeros 
SF['sidewalk_length'].fillna(0, inplace=True)
In [64]:
# check the null values have been filled with zeros
SF[SF['sidewalk_length'].isnull()]
Out[64]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... asthma lbw cvd edu ling pov white_pct african_am asian_amer sidewalk_length

0 rows × 25 columns

In [65]:
# display the data again
# sidewalk length is greater in the western part of SF
SF.plot(column='sidewalk_length',legend=True)
Out[65]:
<AxesSubplot:>

2. Bike rack

In [66]:
# load bike rack data
# inspect the GeoJSON structure to see which fields to extract
url = "https://data.sfgov.org/resource/hn4j-6fx5.geojson"

response = requests.get(url)
results = response.text
data = json.loads(results)
data['features'][0]
Out[66]:
{'type': 'Feature',
 'geometry': {'type': 'Point', 'coordinates': [-122.40533, 37.788067]},
 'properties': {':@computed_region_6qbp_sg9q': '19',
  ':@computed_region_h4ep_8xdi': None,
  'location': 'Hermes',
  ':@computed_region_26cr_cadq': '3',
  'objectid': '12163',
  'globalid': None,
  ':@computed_region_ajp5_b2md': '8',
  'placement': 'SIDEWALK',
  ':@computed_region_qgnn_b9vv': '6',
  'install_yr': '2004',
  'install_mo': '0',
  'address': '125 GRANT AVE',
  'lon': '-122.405341',
  'street': 'GRANT',
  'racks': '1',
  ':@computed_region_jwn9_ihcz': '19',
  ':@computed_region_9jxd_iqea': '5',
  ':@computed_region_696y_nzzn': '43',
  'spaces': '2',
  'lat': '37.788072'}}
In [67]:
# Extract lon/lat coordinates of the bike racks (1000 features returned)
df6 = pd.DataFrame(columns=['lon','lat'])

for i in range(len(data['features'])):
    df6.loc[i] = data['features'][i]['geometry']['coordinates']
df6.head()
Out[67]:
         lon        lat
0 -122.405330  37.788067
1 -122.420220  37.788628
2 -122.405640  37.789780
3 -122.392520  37.793137
4 -122.440060  37.729187
In [68]:
# build Point geometries and convert df6 to a GeoDataFrame
shape = [Point(xy) for xy in zip(df6['lon'],df6['lat'])]
gdf6 = gpd.GeoDataFrame(df6, crs=df1.crs, geometry=shape)
gdf6.head()
Out[68]:
lon lat geometry
0 -122.40533 37.788067 POINT (-122.40533 37.78807)
1 -122.42022 37.788628 POINT (-122.42022 37.78863)
2 -122.40564 37.789780 POINT (-122.40564 37.78978)
3 -122.39252 37.793137 POINT (-122.39252 37.79314)
4 -122.44006 37.729187 POINT (-122.44006 37.72919)
In [69]:
# plot the bike rack data
base = SF.plot(color='none', edgecolor='gray', linewidth=.1, figsize=(5,5))
gdf6.plot(ax=base, color='grey', markersize=.1)
Out[69]:
<AxesSubplot:>
In [70]:
# use a spatial join plus a groupby to prepare the data:
# count the number of bike racks within each census tract
c=gpd.sjoin(SF, gdf6).groupby('GEOID').count()
In [71]:
# every column holds the same count; copy 'STATEFP' into a new column called 'bikerack_count'
c['bikerack_count']=c['STATEFP']
c.head(1)
Out[71]:
STATEFP COUNTYFP TRACTCE AFFGEOID NAME LSAD ALAND AWATER geometry tract ... ling pov white_pct african_am asian_amer sidewalk_length index_right lon lat bikerack_count
GEOID
6075010100 3 3 3 3 3 3 3 3 3 3 ... 3 3 3 3 3 3 3 3 3 3

1 rows × 28 columns
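A cleaner equivalent that avoids borrowing a column: groupby().size() counts the joined rows per tract directly. A sketch:

# equivalent count without the placeholder column
# bikerack_count = gpd.sjoin(SF, gdf6).groupby('GEOID').size().rename('bikerack_count')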

In [72]:
# merge the 'bikerack_count' and SF main dataset
SF=SF.merge(c['bikerack_count'],how='left',on='GEOID')
In [73]:
# fill up null values with zero to represent no bike rack found
SF['bikerack_count'].fillna(0, inplace=True)
In [74]:
# found that bike racks concentrate in the northeastern part of SF
SF.plot(column='bikerack_count')
Out[74]:
<AxesSubplot:>

3. Bike sharing station

In [75]:
# load bike-sharing station data
url="https://data.sfgov.org/resource/7jbp-yzp3.json"

response = requests.get(url)
results = response.text
data = pd.read_json(results)
In [76]:
# cast the stations' lat/lon strings to float and build Point geometries
# for a GeoDataFrame
shape = [Point(xy) for xy in zip(data.station_longitude.astype(float), data.station_latitude.astype(float))]
gdf7 = gpd.GeoDataFrame(data, crs=df1.crs, geometry=shape)
gdf7.head(2)
Out[76]:
station_id station_name station_id_domo has_kiosk dock_count station_latitude station_longitude region_id shape geometry
0 17 Embarcadero BART Station (Beale St at Market St) SF-E29-3 true 29 37.792251 -122.397086 3.0 POINT (-122.39707 37.792248) POINT (-122.39709 37.79225)
1 30 San Francisco Caltrain (Townsend St at 4th St) SF-J29 true 19 37.776598 -122.395282 3.0 POINT (-122.39527 37.776592) POINT (-122.39528 37.77660)
In [77]:
# Drop stations with missing coordinates (latitude recorded as 0)
gdf7=gdf7[gdf7.station_latitude!=0]
In [78]:
# display the result
# bike-sharing stations concentrate in the northeastern part of SF, similar to the bike racks
base= SF.plot(color='none', edgecolor='gray', linewidth=.1, figsize=(4,4))
gdf7[gdf7["station_id_domo"].str.contains('SF')].plot(ax=base, markersize=.5)
Out[78]:
<AxesSubplot:>
In [79]:
# create a new column storing each tract's distance to the nearest bike-sharing station
SF['min_dist_to_bs'] = SF.geometry.apply(lambda g: gdf7[gdf7["station_id_domo"].str.contains('SF')].distance(g).min())
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: UserWarning: Geometry is in a geographic CRS. Results from 'distance' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  
In [80]:
SF.head(2)
Out[80]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... cvd edu ling pov white_pct african_am asian_amer sidewalk_length bikerack_count min_dist_to_bs
0 06 075 010200 1400000US06075010200 6075010200 102 CT 515958 295388 POLYGON ((-122.42667 37.80964, -122.42488 37.8... ... 4.39 1.6 1.1 12.1 81.099998 0.6 10.200000 0.035729 2.0 0.00000
1 06 075 011200 1400000US06075011200 6075011200 112 CT 177414 0 POLYGON ((-122.41629 37.79389, -122.41522 37.7... ... 3.71 10.6 13.1 26.0 55.099998 1.5 35.099998 0.008231 1.0 0.00264

2 rows × 27 columns

In [81]:
# display the result: accessibility to bike-sharing stations
SF.plot(column='min_dist_to_bs',legend=True)
Out[81]:
<AxesSubplot:>

Preparing data for GWR

In [82]:
SF.crs
Out[82]:
<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - NAD83
- bounds: (167.65, 14.92, -47.74, 86.46)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich
In [83]:
SF_3857 = SF.to_crs("EPSG:3857")
SF_3857.crs
Out[83]:
<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World - 85°S to 85°N
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
In [84]:
# create two columns 'x' and 'y' to store x,y coordinates using centroid function
SF_3857["x"]=SF_3857.centroid.map(lambda p: p.x)
SF_3857["y"]=SF_3857.centroid.map(lambda p: p.y)
SF_3857.head(2)
Out[84]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... ling pov white_pct african_am asian_amer sidewalk_length bikerack_count min_dist_to_bs x y
0 06 075 010200 1400000US06075010200 6075010200 102 CT 515958 295388 POLYGON ((-13628474.780 4552569.154, -13628274... ... 1.1 12.1 81.099998 0.6 10.200000 0.035729 2.0 0.00000 -1.362797e+07 4.551968e+06
1 06 075 011200 1400000US06075011200 6075011200 112 CT 177414 0 POLYGON ((-13627319.401 4550350.608, -13627199... ... 13.1 26.0 55.099998 1.5 35.099998 0.008231 1.0 0.00264 -1.362692e+07 4.550221e+06

2 rows × 29 columns

In [85]:
fig, ax = plt.subplots(figsize = (6, 6))

SF_3857.plot(ax=ax, **{'edgecolor': 'black', 'facecolor': 'white'})
SF_3857.centroid.plot(ax = ax, c = 'black', markersize=10)
plt.show()
In [86]:
#Create choropleth quantile maps of each variable (including three new physical activity variables) 
f,(ax1,ax2,ax3) = plt.subplots(3,5,figsize=(32,16))


# Health
SF_3857.plot(column='asthma', legend=True, ax=ax1[0],
        scheme="quantiles",  k=5, cmap='GnBu',edgecolor='black')
ax1[0].set_title("Asthma", fontsize=16)

SF_3857.plot(column='cvd', legend=True, ax=ax2[0],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax2[0].set_title("Cardiovascular disease", fontsize=16)

SF_3857.plot(column='lbw', legend=True, ax=ax3[0],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax3[0].set_title("Low birth weight", fontsize=16)


# Env
SF_3857.plot(column='diesel', legend=True, ax=ax1[1],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax1[1].set_title("Diesel PM", fontsize=16)

SF_3857.plot(column='traffic', legend=True, ax=ax2[1],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax2[1].set_title("Traffic density", fontsize=16)

SF_3857.plot(column='pm', legend=True, ax=ax3[1],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax3[1].set_title("PM2.5", fontsize=16)


# Socioeconomic
SF_3857.plot(column='edu', legend=True, ax=ax1[2],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax1[2].set_title("% with education below high school", fontsize=16)

SF_3857.plot(column='ling', legend=True, ax=ax2[2],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax2[2].set_title("Linguistic barrier", fontsize=16)

SF_3857.plot(column='pov', legend=True, ax=ax3[2],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax3[2].set_title("Poverty", fontsize=16)

# Ethnicity
SF_3857.plot(column='white_pct', legend=True, ax=ax1[3],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax1[3].set_title("% white households", fontsize=16)

SF_3857.plot(column='asian_amer', legend=True, ax=ax2[3],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax2[3].set_title("% asian am households", fontsize=16)

SF_3857.plot(column='african_am', legend=True, ax=ax3[3],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax3[3].set_title("% african am households", fontsize=16)


# Physical activity
SF_3857.plot(column='sidewalk_length', legend=True, ax=ax1[4],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax1[4].set_title("Sidewalk length", fontsize=16)

SF_3857.plot(column='bikerack_count', legend=True, ax=ax2[4],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax2[4].set_title("# bike racks", fontsize=16)

SF_3857.plot(column='min_dist_to_bs', legend=True, ax=ax3[4],
        scheme='quantiles', cmap='GnBu', k=5, edgecolor='black')
ax3[4].set_title("Accessibility to bike-sharing station", fontsize=16)





for i in range(5):
    ax1[i].axis('off')
    ax2[i].axis('off')
    ax3[i].axis('off')
    cx.add_basemap(ax1[i], crs=SF_3857.crs.to_string(), source=cx.providers.Stamen.TonerLite)
    cx.add_basemap(ax2[i], crs=SF_3857.crs.to_string(), source=cx.providers.Stamen.TonerLite)
    cx.add_basemap(ax3[i], crs=SF_3857.crs.to_string(), source=cx.providers.Stamen.TonerLite)

 
/opt/anaconda3/lib/python3.7/site-packages/mapclassify/classifiers.py:236: UserWarning: Warning: Not enough unique values in array to form k classes
  "Warning: Not enough unique values in array to form k classes", UserWarning
/opt/anaconda3/lib/python3.7/site-packages/mapclassify/classifiers.py:238: UserWarning: Warning: setting k to 1
  Warn("Warning: setting k to %d" % k_q, UserWarning)
/opt/anaconda3/lib/python3.7/site-packages/mapclassify/classifiers.py:236: UserWarning: Warning: Not enough unique values in array to form k classes
  "Warning: Not enough unique values in array to form k classes", UserWarning
/opt/anaconda3/lib/python3.7/site-packages/mapclassify/classifiers.py:238: UserWarning: Warning: setting k to 4
  Warn("Warning: setting k to %d" % k_q, UserWarning)

Results of SF-level correlation analysis including physical activity

In [87]:
dfData = SF[['traffic','diesel','pm','asthma','lbw','cvd','sidewalk_length', 'bikerack_count', 'min_dist_to_bs']].corr()
plt.subplots(figsize=(6, 6)) 

sns.heatmap(dfData, annot=True, vmax=1, square=False, cmap="RdBu", alpha=.9, linewidths=2, linecolor='white', cbar=True)
plt.show()

Global regression analysis: steps and major findings

In [88]:
# First model - 12 variables with no interaction term

y, X = dmatrices('asthma ~ pm + traffic + diesel + edu + ling + pov + white_pct + african_am + asian_amer + sidewalk_length + bikerack_count + min_dist_to_bs', 
                 data=SF_3857, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 asthma   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.665
Method:                 Least Squares   F-statistic:                     33.03
Date:                Fri, 11 Dec 2020   Prob (F-statistic):           1.90e-39
Time:                        12:22:27   Log-Likelihood:                -817.60
No. Observations:                 195   AIC:                             1661.
Df Residuals:                     182   BIC:                             1704.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          68.5708     53.229      1.288      0.199     -36.455     173.597
pm                  0.4203      5.834      0.072      0.943     -11.091      11.932
traffic             0.0025      0.003      0.945      0.346      -0.003       0.008
diesel              0.2006      0.064      3.143      0.002       0.075       0.327
edu                 0.3906      0.301      1.300      0.195      -0.202       0.984
ling               -0.3723      0.285     -1.308      0.193      -0.934       0.189
pov                 0.0130      0.145      0.089      0.929      -0.273       0.299
white_pct          -0.7298      0.161     -4.535      0.000      -1.047      -0.412
african_am          0.9759      0.214      4.558      0.000       0.553       1.398
asian_amer         -0.4839      0.159     -3.048      0.003      -0.797      -0.171
sidewalk_length   -70.5948     53.424     -1.321      0.188    -176.004      34.815
bikerack_count      0.3385      0.191      1.775      0.078      -0.038       0.715
min_dist_to_bs   -748.8162    414.108     -1.808      0.072   -1565.886      68.254
==============================================================================
Omnibus:                       44.648   Durbin-Watson:                   1.891
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               94.490
Skew:                           1.062   Prob(JB):                     3.03e-21
Kurtosis:                       5.668   Cond. No.                     3.25e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.25e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [89]:
SF_3857['bike_diesel_rack']=SF_3857['diesel']*SF_3857['bikerack_count']
In [90]:
SF_3857['bike_diesel_bshare']=SF_3857['diesel']*SF_3857['min_dist_to_bs']
In [91]:
# Second model - 14 variables with two interaction terms

y, X = dmatrices('asthma ~ pm + traffic + diesel + edu + ling + pov + white_pct + african_am + asian_amer + sidewalk_length + bikerack_count + min_dist_to_bs + bike_diesel_rack + bike_diesel_bshare', 
                 data=SF_3857, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 asthma   R-squared:                       0.696
Model:                            OLS   Adj. R-squared:                  0.672
Method:                 Least Squares   F-statistic:                     29.41
Date:                Fri, 11 Dec 2020   Prob (F-statistic):           3.32e-39
Time:                        12:22:27   Log-Likelihood:                -814.31
No. Observations:                 195   AIC:                             1659.
Df Residuals:                     180   BIC:                             1708.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             71.0737     52.690      1.349      0.179     -32.896     175.043
pm                    -0.6830      5.789     -0.118      0.906     -12.107      10.741
traffic                0.0026      0.003      0.972      0.333      -0.003       0.008
diesel                 0.3152      0.080      3.938      0.000       0.157       0.473
edu                    0.3331      0.298      1.117      0.266      -0.256       0.922
ling                  -0.4003      0.282     -1.418      0.158      -0.957       0.157
pov                   -0.0196      0.144     -0.136      0.892      -0.304       0.265
white_pct             -0.7500      0.160     -4.702      0.000      -1.065      -0.435
african_am             1.0210      0.213      4.794      0.000       0.601       1.441
asian_amer            -0.4255      0.159     -2.679      0.008      -0.739      -0.112
sidewalk_length      -84.4573     53.291     -1.585      0.115    -189.612      20.698
bikerack_count         1.7988      0.644      2.795      0.006       0.529       3.069
min_dist_to_bs       589.5027   1233.198      0.478      0.633   -1843.882    3022.888
bike_diesel_rack      -0.0177      0.007     -2.411      0.017      -0.032      -0.003
bike_diesel_bshare   -31.3196     28.579     -1.096      0.275     -87.712      25.072
==============================================================================
Omnibus:                       37.976   Durbin-Watson:                   1.920
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               65.281
Skew:                           1.001   Prob(JB):                     6.68e-15
Kurtosis:                       5.006   Cond. No.                     1.08e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.08e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
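The third model below is described as the result of backward stepwise regression; the elimination itself was done by hand. A minimal sketch of one such loop, assuming a 0.05 p-value threshold (the author's actual selection steps may have differed):

# repeatedly drop the predictor with the largest p-value until all are significant
terms = ['pm', 'traffic', 'diesel', 'edu', 'ling', 'pov', 'white_pct',
         'african_am', 'asian_amer', 'sidewalk_length', 'bikerack_count',
         'min_dist_to_bs', 'bike_diesel_rack', 'bike_diesel_bshare']
while True:
    y, X = dmatrices('asthma ~ ' + ' + '.join(terms), data=SF_3857, return_type='dataframe')
    res = sm.OLS(y, X).fit()
    pvals = res.pvalues.drop('Intercept')
    if pvals.max() < 0.05:
        break
    terms.remove(pvals.idxmax())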
In [92]:
# Third model (result of backward stepwise regression)
y, X = dmatrices('asthma ~ diesel + white_pct + african_am + asian_amer + bikerack_count + bike_diesel_rack', 
                 data=SF_3857, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 asthma   R-squared:                       0.680
Model:                            OLS   Adj. R-squared:                  0.670
Method:                 Least Squares   F-statistic:                     66.61
Date:                Fri, 11 Dec 2020   Prob (F-statistic):           6.35e-44
Time:                        12:22:27   Log-Likelihood:                -819.21
No. Observations:                 195   AIC:                             1652.
Df Residuals:                     188   BIC:                             1675.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           73.6118      9.983      7.374      0.000      53.919      93.305
diesel               0.3394      0.055      6.184      0.000       0.231       0.448
white_pct           -0.8715      0.107     -8.115      0.000      -1.083      -0.660
african_am           0.9537      0.189      5.050      0.000       0.581       1.326
asian_amer          -0.6537      0.118     -5.560      0.000      -0.886      -0.422
bikerack_count       1.6338      0.620      2.633      0.009       0.410       2.858
bike_diesel_rack    -0.0163      0.007     -2.305      0.022      -0.030      -0.002
==============================================================================
Omnibus:                       43.827   Durbin-Watson:                   1.917
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               84.189
Skew:                           1.089   Prob(JB):                     5.23e-19
Kurtosis:                       5.370   Cond. No.                     6.36e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.36e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Spatial autocorrelation and GWR analysis results:

Spatial autocorrelation

In [93]:
# reset the index before calculating kernel weights
SF_3857.reset_index(drop=True, inplace=True)
SF_3857.head(2)
Out[93]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry ... white_pct african_am asian_amer sidewalk_length bikerack_count min_dist_to_bs x y bike_diesel_rack bike_diesel_bshare
0 06 075 010200 1400000US06075010200 6075010200 102 CT 515958 295388 POLYGON ((-13628474.780 4552569.154, -13628274... ... 81.099998 0.6 10.200000 0.035729 2.0 0.00000 -1.362797e+07 4.551968e+06 147.940002 0.000000
1 06 075 011200 1400000US06075011200 6075011200 112 CT 177414 0 POLYGON ((-13627319.401 4550350.608, -13627199... ... 55.099998 1.5 35.099998 0.008231 1.0 0.00264 -1.362692e+07 4.550221e+06 111.129997 0.293383

2 rows × 31 columns

In [94]:
w = ps.weights.Kernel.from_dataframe(SF_3857, k=5, fixed=False, function='gaussian')
plot_spatial_weights(w, SF_3857, figsize=(5,5));

# w = ps.weights.Queen.from_dataframe(SF_3857)
# plot_spatial_weights(w, SF_3857, figsize=(5,5));
In [95]:
# Global Moran's I (only y, the asthma values, is used below)
y, X = dmatrices('asthma ~ pm + traffic + diesel + edu + ling + pov + white_pct + african_am + asian_amer + sidewalk_length + bikerack_count + min_dist_to_bs', 
                 data=SF_3857, return_type='dataframe')

moran = esda.Moran(y, w)
moran.I
Out[95]:
0.8232597476169629
In [96]:
# Plot the result
# A Moran's I of 0.82 indicates strong positive spatial autocorrelation
fig, ax = moran_scatterplot(moran, aspect_equal=True)
plt.show()
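esda also computes a permutation-based pseudo p-value alongside the statistic (999 random permutations by default), which can confirm that a value this high is significant:

# pseudo p-value under random spatial permutations; small values reject spatial randomness
print(moran.p_sim)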
In [97]:
# calculating Moran_Local and plotting the result
moran_loc = esda.moran.Moran_Local(y, w)
fig, ax =  splot.esda.moran_scatterplot(moran_loc, p=0.05)
ax.set_xlabel('Asthma')
ax.set_ylabel('Spatial Lag of Asthma')
plt.show()
In [98]:
# plot the result on the map
# two hotspots and one large coldspot appear, closely mirroring the residential
# patterns of white and Asian households

moran_loc = esda.moran.Moran_Local(y,w)
# splot.esda.lisa_cluster(moran_loc, SF_3857, p=0.05, alpha=.9, figsize = (5,5))
splot.esda.plot_local_autocorrelation(moran_loc, SF_3857, 'asthma', p=0.01, quadrant=1)
plt.show()

GWR

In [99]:
#Prepare dataset inputs for GWR
g_y = SF_3857['asthma'].values.reshape((-1, 1))
g_X = SF_3857[['diesel',  'white_pct', 'african_am', 'asian_amer', 'bikerack_count','bike_diesel_rack']].values
u = SF_3857['x']
v = SF_3857['y']
g_coords = list(zip(u, v))
In [100]:
# Calibrate a GWR model for the SF dataset using a computationally selected bandwidth (AICc-optimized)
gwr_selector = Sel_BW(g_coords, g_y, g_X)
gwr_bw = gwr_selector.search()
print('Computationally selected bandwidth: ',gwr_bw)

gwr_model = GWR(g_coords, g_y, g_X, gwr_bw)
gwr_results = gwr_model.fit()
print('Residual sum of squares: ', gwr_results.resid_ss)
Computationally selected bandwidth:  83.0
Residual sum of squares:  34152.95836313484
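Before mapping coefficient surfaces, it can help to see where the model fits well; mgwr's results object exposes a local R² per observation. A quick sketch:

# map local goodness-of-fit (flatten() in case the array is returned as a column vector)
SF_3857['gwr_localR2'] = gwr_results.localR2.flatten()
SF_3857.plot(column='gwr_localR2', legend=True, figsize=(5,5))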
In [101]:
gwr_results.summary();

# X1:'diesel', X2:'white_pct', X3:'african_am', X4:'asian_amer', X5:'bikerack_count', X6:'bike_diesel_rack'
===========================================================================
Model type                                                         Gaussian
Number of observations:                                                 195
Number of covariates:                                                     7

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                          50885.124
Log-likelihood:                                                    -819.215
AIC:                                                               1652.430
AICc:                                                              1655.204
BIC:                                                              49893.800
R2:                                                                   0.680
Adj. R2:                                                              0.670

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- ---------- ---------- ----------
X0                                  73.612      9.983      7.374      0.000
X1                                   0.339      0.055      6.184      0.000
X2                                  -0.871      0.107     -8.115      0.000
X3                                   0.954      0.189      5.050      0.000
X4                                  -0.654      0.118     -5.560      0.000
X5                                   1.634      0.620      2.633      0.008
X6                                  -0.016      0.007     -2.305      0.021

Geographically Weighted Regression (GWR) Results
---------------------------------------------------------------------------
Spatial kernel:                                           Adaptive bisquare
Bandwidth used:                                                      83.000

Diagnostic information
---------------------------------------------------------------------------
Residual sum of squares:                                          34152.958
Effective number of parameters (trace(S)):                           28.079
Degree of freedom (n - trace(S)):                                   166.921
Sigma estimate:                                                      14.304
Log-likelihood:                                                    -780.339
AIC:                                                               1618.837
AICc:                                                              1629.444
BIC:                                                               1714.012
R2:                                                                   0.785
Adjusted R2:                                                          0.749
Adj. alpha (95%):                                                     0.012
Adj. critical t value (95%):                                          2.522

Summary Statistics For GWR Parameter Estimates
---------------------------------------------------------------------------
Variable                   Mean        STD        Min     Median        Max
-------------------- ---------- ---------- ---------- ---------- ----------
X0                       67.177     63.347    -90.596     58.075    167.398
X1                        0.251      0.211     -0.103      0.198      0.866
X2                       -0.717      0.584     -1.660     -0.644      0.966
X3                        0.515      0.897     -1.357      0.552      2.427
X4                       -0.388      0.672     -1.480     -0.348      1.153
X5                        1.115      1.807     -4.181      1.848      3.444
X6                       -0.001      0.031     -0.040     -0.014      0.104
===========================================================================
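The headline diagnostics can also be pulled programmatically off the results object; a minimal sketch (attribute names follow mgwr's GWRResults API as I understand it):

# the drop in AICc from the global model (1655.2) to GWR (1629.4) and the
# rise in R2 (0.680 -> 0.785) suggest meaningful spatial variation
print('GWR AICc:', gwr_results.aicc)
print('GWR R2:  ', gwr_results.R2)
print('Effective number of parameters:', gwr_results.tr_S)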

In [102]:
# Prepare GWR results for mapping
# Add the local parameter estimates (one column per covariate) to the GeoDataFrame
SF_3857['gwr_intercept'] = gwr_results.params[:,0]
SF_3857['gwr_diesel'] = gwr_results.params[:,1]
SF_3857['gwr_white'] = gwr_results.params[:,2]
SF_3857['gwr_african'] = gwr_results.params[:,3]
SF_3857['gwr_asian'] = gwr_results.params[:,4]
SF_3857['gwr_bikerack_count'] = gwr_results.params[:,5]

#Obtain t-vals filtered based on multiple testing correction
gwr_filtered_t = gwr_results.filter_tvals()
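filter_tvals() zeroes out local t-values whose absolute value falls below the corrected critical t (2.522 in the summary above); the grey tracts in the maps below are exactly those zeros. A quick sketch to count how many tracts drop out per covariate (critical_tval follows mgwr's GWRResults API):

print('corrected critical t:', gwr_results.critical_tval())
print('non-significant tracts per covariate:', (gwr_filtered_t == 0).sum(axis=0))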
In [103]:
# Plot the six local coefficient surfaces; tracts whose filtered t-value is 0
# (i.e. not significant after the multiple-testing correction) are greyed out
params = ['gwr_intercept', 'gwr_diesel', 'gwr_white',
          'gwr_african', 'gwr_asian', 'gwr_bikerack_count']
titles = ['Intercept Surface', 'diesel Surface', 'white Surface',
          'african Surface', 'asian Surface', 'bike rack count']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12,6), constrained_layout=True)
for i, ax in enumerate(axes.flat):
    ax.set_title('GWR ' + titles[i] + ' (BW: ' + str(gwr_bw) + ')', fontsize=10)
    SF_3857.plot(params[i], legend=True, cmap='seismic', ax=ax,
                 edgecolor='black', alpha=.65, legend_kwds={'shrink': .9})
    if (gwr_filtered_t[:, i] == 0).any():
        SF_3857[gwr_filtered_t[:, i] == 0].plot(color='lightgrey', ax=ax, edgecolor='black')
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
In [104]:
# Local model fit: map the local R2 surface
SF_3857['R2'] = gwr_results.localR2
SF_3857.plot('R2', legend = True, figsize=(5,5),legend_kwds={'shrink': .8})
ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
#plt.savefig('local_R2')
plt.show()
In [105]:
# Local multicollinearity diagnostics: compute once, then unpack
# (per mgwr's return order, index 1 = local VIFs, index 3 = local VDPs)
local_collin = gwr_results.local_collinearity()
VIF = local_collin[1]
VDP = local_collin[3]

names = ['Diesel', 'White', 'African', 'Asian', 'Bike', 'Interaction']

fig, ax = plt.subplots(1, 6, figsize = (20, 5))
for col in range(6):
    SF_3857['vif'] = VIF[:, col]
    SF_3857.plot('vif', ax = ax[col], legend = True, legend_kwds={'shrink': .5})
    ax[col].set_title('VIF: ' + names[col])
    ax[col].get_xaxis().set_visible(False)
    ax[col].get_yaxis().set_visible(False)
    

fig, ax = plt.subplots(1, 6, figsize = (20, 5))
for col in range(6):
    SF_3857['vdp'] = VDP[:, col]
    SF_3857.plot('vdp', ax = ax[col], legend = True, legend_kwds={'shrink': .5})
    ax[col].set_title('VDP: ' + names[col])
    ax[col].get_xaxis().set_visible(False)
    ax[col].get_yaxis().set_visible(False)
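A common rule of thumb flags VIF values above 10 as problematic. A minimal sketch to count tracts whose local estimates may be unstable (the threshold is a convention, not something derived from this analysis):

high_vif = (VIF > 10).any(axis=1)
print('tracts with any local VIF > 10:', high_vif.sum())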

PART 3: Interactive mapping with Pydeck, OSMNX, Folium

1. Pydeck - grid, heatmap, trip layers

2. OSM network analysis and Folium maps

In [106]:
# Display bike rack data using a deck.gl GridLayer via pydeck

bike_rack = pdk.Layer(
    'GridLayer',
    df6,
    get_position=['lon', 'lat'],
    auto_highlight=True,
    elevation_scale=150,
    pickable=True,
    elevation_range=[0, 30],
    extruded=True,
    opacity=.3,
    coverage=.7)

view_state = pdk.ViewState(
    longitude=-122.415,
    latitude=37.8023,
    zoom=9,
    min_zoom=1,
    max_zoom=15,
    pitch=50,
    bearing=30)

r = pdk.Deck(layers=[bike_rack], initial_view_state=view_state)
r.to_html('Grid.html')
Out[106]:
In [107]:
# Display bike-sharing station data using a deck.gl HeatmapLayer via pydeck



view = pdk.ViewState(
    longitude=-122.455,
    latitude=37.7423,
    zoom=11,
    min_zoom=1,
    max_zoom=15)


COLOR_SCALE = [
    [90, 249, 232],
    [70, 204, 197],
    [55, 150, 150],
    [20, 100, 100],
]

bike_sharing = pdk.Layer(
    "HeatmapLayer",
    data=gdf7,
    opacity=0.9,
    get_position=["station_longitude", "station_latitude"],
    aggregation='"MEAN"',
    color_range=COLOR_SCALE,
    threshold=1,
    get_weight="dock_count",
    pickable=True,
)




r = pdk.Deck(
    layers=bike_sharing,
    initial_view_state=view,
    tooltip={"text": "Concentration of bike-sharing decks in orange"},
)

r.to_html("heatmap_layer.html")
Out[107]:
In [108]:
# Create an OSMnx graph called G_bike representing the bike network, then plot it

G_bike = ox.graph.graph_from_bbox(37.86,37.7,-122.325,-122.55,network_type='bike')
ox.plot.plot_graph(G_bike,ax=None, figsize=(8, 8), bgcolor='#111111', 
                   node_color='w', node_size=.5, node_alpha=None, node_edgecolor='none', node_zorder=1, 
                   edge_color='#999999', edge_linewidth=.1, edge_alpha=None, bbox=None)
Out[108]:
(<Figure size 576x576 with 1 Axes>, <AxesSubplot:>)
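Before routing on the graph, a quick sketch of its size using plain networkx counts (no extra osmnx API assumed):

print(len(G_bike.nodes), 'nodes,', len(G_bike.edges), 'edges in the bike network')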
In [109]:
# input origin and destination as free-text addresses
# here I assume a trip from "Hunter's Point, SF" to "Taiwan Restaurant, SF"

origin_input = input()
destination_input = input()
Hunter's Point, SF
Taiwan Restaurant, SF
In [110]:
# use the geocoder to transform the string addresses into (lat, lon) coordinates
origin = ox.geocoder.geocode(origin_input)
destination = ox.geocoder.geocode(destination_input) 
In [111]:
# showing origin's latlon
origin 
Out[111]:
(37.7267715, -122.3715718)
In [112]:
# showing destination's latlon
destination
Out[112]:
(37.7828379, -122.464162)
In [113]:
# calculating the straight-line distance between the OD pair (in degrees, since the coordinates are unprojected lat-lon)
ox.distance.euclidean_dist_vec(origin[0], origin[1], destination[0], destination[1])
Out[113]:
0.10824225766769899
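The value above is in decimal degrees because both points are unprojected lat-lon. For an on-the-ground distance in meters, osmnx 1.x ships a great-circle helper; a sketch, assuming ox.distance.great_circle_vec is available in the installed version:

# great-circle (haversine) distance in meters between origin and destination
ox.distance.great_circle_vec(origin[0], origin[1], destination[0], destination[1])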
In [114]:
# getting the nearest edge, returned as a (u, v, key) tuple, for the origin and destination
ii_bike=ox.distance.get_nearest_edge(G_bike, origin)
jj_bike=ox.distance.get_nearest_edge(G_bike, destination)
In [115]:
# showing edge ID
ii_bike
Out[115]:
(65320592, 65322415, 0)
In [116]:
# showing edge ID
jj_bike
Out[116]:
(65287668, 65302772, 0)
In [117]:
# computing the shortest path, returned as a list of node IDs, between the first endpoint node (u) of each nearest edge
path_bike = ox.distance.shortest_path(G_bike, ii_bike[0], jj_bike[0], weight='length')
path_bike[:10]
Out[117]:
[65320592,
 65320588,
 65320580,
 65340028,
 65366369,
 65355023,
 6923676856,
 6923676854,
 65357496,
 5443154347]
In [118]:
# the total number of nodes on the shortest path
len(path_bike)
Out[118]:
193
In [119]:
# retrieving the full attribute dict for every edge along the route
edges = ox.utils_graph.get_route_edge_attributes(G_bike, path_bike, attribute=None, minimize_key='length', retrieve_default=None)
edges[:2]
Out[119]:
[{'osmid': 8917551,
  'name': 'Navy Road',
  'highway': 'residential',
  'oneway': False,
  'length': 66.526,
  'geometry': <shapely.geometry.linestring.LineString at 0x1a68df0110>},
 {'osmid': 8917551,
  'name': 'Navy Road',
  'highway': 'residential',
  'oneway': False,
  'length': 23.156,
  'geometry': <shapely.geometry.linestring.LineString at 0x1a68df0610>}]
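Each edge dict above carries a 'length' attribute in meters, so the total route distance follows from one sum over the edges list just computed (a minimal sketch):

route_length_m = sum(e['length'] for e in edges)
print(round(route_length_m), 'meters along the bike route')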
In [120]:
# Mapping the shortest route and diesel PM onto the folium platform

diesel_map = folium.Map([37.7556, -122.4399], zoom_start=12)
SF_wgs84 = SF.to_crs('EPSG:4326')  # folium expects WGS84; this CRS syntax avoids pyproj's '+init' deprecation warning


folium.Choropleth(
    geo_data=SF_wgs84,
    name='choropleth',
    data=SF_wgs84,
    columns=['TRACTCE','diesel'],
    key_on = 'feature.properties.{}'.format('TRACTCE'),   
    fill_color='YlOrBr',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Diesel Emission'
).add_to(diesel_map)

route_diesel_map=ox.folium.plot_route_folium(G_bike, path_bike, route_map=diesel_map, fit_bounds=False,
                              popup_attribute=None,  route_color='#5e8277', route_width=5, route_opacity=.9)
route_diesel_map
Out[120]:
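Side note: for parity with the pydeck layers exported to HTML above, the folium map can be written out as well (folium's Map.save; the filename is arbitrary):

route_diesel_map.save('route_diesel_map.html')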
In [121]:
# use the OSM API's NodeGet to look up the (lat, lon) of each node on the path and store the results in node_lst
# i.e. transforming node IDs "back" into coordinates

api = osm.OsmApi() 

node_lst=[]
for i in range(len(path_bike)):
    node_lst.append(api.NodeGet(path_bike[i]))

node_lst[:2]
Out[121]:
[{'id': 65320592,
  'visible': True,
  'version': 2,
  'changeset': 2102778,
  'timestamp': datetime.datetime(2009, 8, 10, 21, 59, 29),
  'user': 'woodpeck_fixbot',
  'uid': 147510,
  'lat': 37.727563,
  'lon': -122.371273,
  'tag': {}},
 {'id': 65320588,
  'visible': True,
  'version': 2,
  'changeset': 2102778,
  'timestamp': datetime.datetime(2009, 8, 10, 21, 59, 29),
  'user': 'woodpeck_fixbot',
  'uid': 147510,
  'lat': 37.727748,
  'lon': -122.371954,
  'tag': {}}]
In [122]:
# create a list called node_latlon holding [lon, lat] pairs extracted from node_lst (deck.gl expects longitude first)
node_latlon=[]

for i in range(len(node_lst)):
    node_latlon.append(list([node_lst[i]["lon"], node_lst[i]["lat"]]))
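Note that each NodeGet call above is a separate HTTP request, so the loop is slow for long routes. The same coordinates are already stored on the graph nodes ('x' = lon, 'y' = lat in OSMnx graphs), so an equivalent, purely local sketch is:

# build the same [lon, lat] list from the graph's node attributes, no API calls
node_latlon_local = [[G_bike.nodes[n]['x'], G_bike.nodes[n]['y']] for n in path_bike]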
In [123]:
type(node_latlon)
Out[123]:
list
In [124]:
# check that the data format is as expected
node_latlon[0]
Out[124]:
[-122.371273, 37.727563]
In [125]:
# create a dataframe called d with two columns to hold the coordinate and timestamp data
# preview node_latlon as a pd.Series
d = pd.DataFrame(columns=['coordinates','timestamps'])
pd.Series(node_latlon)
Out[125]:
0        [-122.371273, 37.727563]
1        [-122.371954, 37.727748]
2      [-122.3722079, 37.7277957]
3      [-122.3724341, 37.7282512]
4      [-122.3725856, 37.7283456]
                  ...            
188    [-122.4604941, 37.7812627]
189    [-122.4610901, 37.7812355]
190    [-122.4612247, 37.7830976]
191    [-122.4622971, 37.7830489]
192    [-122.4633696, 37.7830001]
Length: 193, dtype: object
In [126]:
# create a placeholder row, then store the whole coordinate list in its 'coordinates' cell
# (.at avoids pandas' chained-assignment pitfall)
d.loc[0] = 1
d.at[0, 'coordinates'] = node_latlon
d['coordinates']
Out[126]:
0    [[-122.371273, 37.727563], [-122.371954, 37.72...
Name: coordinates, dtype: object
In [127]:
# assume each travel step takes 1 second: I could not derive actual travel-time
# estimates, so the timestamps are simply 0, 1, 2, ... along the route
time_lst = list(range(len(node_latlon)))

d.at[0, 'timestamps'] = time_lst
d
Out[127]:
coordinates timestamps
0 [[-122.371273, 37.727563], [-122.371954, 37.72... [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
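TripsLayer expects exactly one timestamp per coordinate on each trip row; a one-line sanity check on the single trip built above:

assert len(d.at[0, 'coordinates']) == len(d.at[0, 'timestamps'])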
In [128]:
# animating the shortest route with a deck.gl TripsLayer via pydeck, combined with the grid and heatmap layers from above

layer = pdk.Layer(
    "TripsLayer",
    d,
    get_path="coordinates",
    get_timestamps="timestamps",
    get_color=[200, 50, 150],
    opacity=1,
    width_min_pixels=10,
    rounded=False,
    trail_length=500,
    current_time=300,
)

view_state = pdk.ViewState(latitude=37.7749295, longitude=-122.4194155, zoom=10, bearing=50, pitch=45)

r = pdk.Deck(layers=[layer,bike_rack,bike_sharing], initial_view_state=view_state)
# r = pdk.Deck(layers=layer, initial_view_state=view_state)

r.to_html("trips_layer.html")
Out[128]:

Thanks!